package org.nerve.dir;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;

/**
 * Base class for line-oriented data file parsers.
 *
 * <p>{@link #start()} opens {@link #filePath} with the configured
 * {@link #encoding}, hands every line to {@link #onData(String)} and finally
 * calls {@link #onEnd()}. Subclasses override those two hooks.
 *
 * <p>Note: earlier documentation of this class mentioned a byte-stream read
 * mode and a {@code CommonFileLogger} superclass. Neither is present in this
 * class: only line-by-line reading is implemented here and the class extends
 * {@code Object} directly.
 *
 * com.nerve.jiepu.forensicsystem.parser
 * Created by zengxm on 2015/7/2 0002.
 */
public abstract class AbstractFileReader {

	/** Logger for static helpers, which cannot use the per-instance {@link #logger}. */
	private static final Logger LOG = LoggerFactory.getLogger(AbstractFileReader.class);

	/** Size, in chars, of the line reader's buffer (10 * 1024 * 1024). */
	private static final int LINE_BUFFER_SIZE = 10 * 1024 * 1024;

	/** Charset name returned by {@link #getCharsetOfFile(String)} when nothing else matches. */
	private static final String FALLBACK_CHARSET = "GBK";

	/** Per-instance logger named after the concrete subclass. */
	protected Logger logger = LoggerFactory.getLogger(getClass());

	/**
	 * Path of the file to read (documented by the original author as an
	 * absolute path); passed unchanged to {@link FileInputStream}.
	 */
	protected String filePath;
	/**
	 * Number of bytes to read per chunk. No code in this class uses it.
	 */
	protected int readSize;
	/**
	 * Counter of lines processed; incremented after each
	 * {@link #onData(String)} call that returned {@code true}.
	 */
	protected long readIndex;

	/** Name of the charset used to decode the file; defaults to {@code "utf-8"}. */
	protected String encoding="utf-8";

	/**
	 * Creates a reader with default settings.
	 *
	 * @throws Exception never thrown here; kept in the signature for
	 *                   compatibility with existing subclasses and callers
	 */
	public AbstractFileReader()throws Exception{
		super();
	}

	/**
	 * Starts parsing; delegates to {@link #readByLine()}.
	 *
	 * @throws Exception whatever {@link #readByLine()} throws
	 */
	public void start() throws Exception{
		readByLine();
	}

	/**
	 * Reads {@link #filePath} line by line, decoding with {@link #getEncoding()}.
	 *
	 * <p>Each line is passed to {@link #onData(String)}; reading stops early as
	 * soon as that hook returns {@code false}. {@link #onEnd()} is called once
	 * the loop finishes (including after an early stop), while the reader is
	 * still open.
	 *
	 * @throws Exception any failure while opening, decoding or reading the
	 *                   file, or thrown by the hooks; it is logged and rethrown
	 */
	protected void readByLine()throws Exception{
		long startTime = System.currentTimeMillis();
		// try-with-resources closes the file stream even when the reader
		// itself cannot be built (e.g. unsupported encoding name).
		try (InputStream in = new BufferedInputStream(new FileInputStream(filePath));
			 BufferedReader br = new BufferedReader(new InputStreamReader(in, getEncoding()), LINE_BUFFER_SIZE)) {

			String line;
			while((line=br.readLine())!=null){
				if(!onData(line)) break;
				readIndex++;
			}

			// Milliseconds to whole seconds.
			long elapsedSeconds = (System.currentTimeMillis()-startTime)/1000;
			logger.info("parse complete[line model], time use {} s, parse {} data line!", elapsedSeconds, readIndex);
			onEnd();
		}catch(Exception e){
			logger.error("error on parse file with line model",e);
			throw e;
		}
	}

	/**
	 * Hook invoked after the read loop has finished.
	 * Does nothing by default; subclasses override it as needed.
	 */
	protected void onEnd(){
	}

	/**
	 * Hook invoked for every line read from the file.
	 *
	 * @param line the line just read, without its line terminator
	 * @return {@code false} to stop reading the file; the default
	 *         implementation always returns {@code true}
	 */
	protected boolean onData(String line){
		return true;
	}

	/**
	 * @return the path of the file to read
	 */
	public String getFilePath() {
		return filePath;
	}

	/**
	 * @param filePath the path of the file to read
	 */
	public void setFilePath(String filePath) {
		this.filePath = filePath;
	}

	/**
	 * Returns the charset name used when reading the file.
	 *
	 * @return the charset name
	 */
	public String getEncoding() {
		return encoding;
	}

	/**
	 * Sets the charset name used when reading the file.
	 *
	 * @param encoding the charset name, or {@code null} to detect it from the
	 *                 current {@link #filePath} via
	 *                 {@link #getCharsetOfFile(String)}
	 */
	public void setEncoding(String encoding) {
		if(encoding == null){
			// No explicit charset given: try to detect it from the file.
			this.encoding = getCharsetOfFile(filePath);
			logger.debug("detected Encoding = {}", this.encoding);
		}
		else{
			this.encoding = encoding;
		}
		// Log the value actually in effect, not the (possibly null) argument.
		logger.debug("use Encoding {}", this.encoding);
	}

	/**
	 * Guesses the charset of a file.
	 *
	 * <p>A byte-order mark wins if present (UTF-16LE, UTF-16BE or UTF-8).
	 * Otherwise the bytes are scanned heuristically: the file is reported as
	 * UTF-8 when the first non-ASCII, non-two-byte sequence found is a
	 * well-formed three-byte UTF-8 sequence. In every other case, including
	 * an empty, missing or unreadable file, {@code "GBK"} is returned.
	 *
	 * @param file path of the file to analyse; {@code null} yields the fallback
	 * @return the detected charset name, never {@code null}
	 **/
	public static String getCharsetOfFile(String file) {
		String charset = FALLBACK_CHARSET;
		if (file == null) {
			LOG.warn("cannot detect charset of a null file path, fall back to {}", charset);
			return charset;
		}
		try (BufferedInputStream bis = new BufferedInputStream(new FileInputStream(file))) {
			byte[] head = new byte[3];
			// The read limit must cover the bytes consumed before reset().
			bis.mark(head.length);
			int read = bis.read(head, 0, head.length);
			if (read == -1)
				return charset; // empty file

			if (head[0] == (byte) 0xFF && head[1] == (byte) 0xFE) {
				charset = "UTF-16LE";
			} else if (head[0] == (byte) 0xFE && head[1] == (byte) 0xFF) {
				charset = "UTF-16BE";
			} else if (head[0] == (byte) 0xEF
					&& head[1] == (byte) 0xBB
					&& head[2] == (byte) 0xBF) {
				charset = "UTF-8";
			} else {
				// No BOM: rewind and inspect the content itself.
				bis.reset();
				if (startsWithUtf8ThreeByteSequence(bis))
					charset = "UTF-8";
			}
		} catch (IOException | SecurityException e) {
			// Detection is best effort: report the problem and keep whatever
			// was determined so far (the fallback unless detection finished).
			LOG.warn("failed to detect charset of file {}", file, e);
		}
		return charset;
	}

	/**
	 * Scans a stream, skipping ASCII bytes and plausible two-byte sequences
	 * (lead 0xC0-0xDF followed by 0x80-0xBF, which may also occur in GBK),
	 * and decides on the first other byte encountered.
	 *
	 * @param in the stream to scan, positioned at the start of the content
	 * @return {@code true} only if that first decisive byte starts a
	 *         well-formed three-byte UTF-8 sequence
	 * @throws IOException if reading fails
	 */
	private static boolean startsWithUtf8ThreeByteSequence(InputStream in) throws IOException {
		int b;
		while ((b = in.read()) != -1) {
			if (b >= 0xF0)
				return false;
			// A continuation byte on its own is treated as GBK.
			if (0x80 <= b && b <= 0xBF)
				return false;
			if (0xC0 <= b && b <= 0xDF) {
				int second = in.read();
				if (0x80 <= second && second <= 0xBF)
					continue; // ambiguous between UTF-8 and GBK, keep looking
				return false;
			}
			if (0xE0 <= b && b <= 0xEF) {
				int second = in.read();
				if (second < 0x80 || second > 0xBF)
					return false;
				int third = in.read();
				return 0x80 <= third && third <= 0xBF;
			}
		}
		return false;
	}
}